Spatial Gymnastics
- Loop and do the following for all catchment distances available
- Using
over() to determine the treso zone number for each student/school point
- Using
gBuffer() to determine the TRESO zones that each school's catchment distance touches
- Setting up multi-core processing for the
foreach() loop is worth it once you are processing more than 100 fields
- 100 fields: 29s compared to 19s
- 1000 fields: 252s compared to 116s
- Combine the datalist of buffered TRESO zones with the tibble containing socio-economic data
- Combine with the historical ADE and utilization data by school
# Catchment-distance percentiles to process. For every entry a matching
# `student_travel_<percentile>` object must already exist in the session.
# percentile_dist <- c("90")
percentile_dist <- c("90", "80", "70", "60", "50")

for (dist in percentile_dist) {
  # Loop through the different catchment areas by retrieving the proper variable.
  student_travel <- get(paste0("student_travel_", dist))

  # Spatial transformation to XY
  student_xy <- create_student_xy(student_travel)
  school_xy <- create_school_xy(student_travel)

  # Create the overlays to get the TRESO zone each XY coordinate falls on
  student_overlay <- create_overlay(student_xy, treso_shp, "student")
  saveRDS(student_overlay, paste0("output/student_overlay_", dist, ".rds"))
  school_overlay <- create_overlay(school_xy, treso_shp, "school")

  # Join the POR and POS together for distance matrix.
  # Assign first, then save: saveRDS() returns NULL invisibly, so piping the
  # join straight into saveRDS() would leave `observed_por_pos` as NULL.
  observed_por_pos <- left_join(student_overlay, school_overlay, by = "school.name")
  saveRDS(observed_por_pos, paste0("output/observed_por_pos_", dist, ".rds"))

  # Save the zones in each school's catchment area
  buffered_df <- buffer_zones(school_xy, treso_shp)
  saveRDS(buffered_df, paste0("output/school_catchment_treso_zones_", dist, ".rds"))

  # Combine the school's buffered TRESO zones with socio-economic info and
  # group_by school name. Same assign-then-save pattern as above so that
  # `school_tb` holds the summary table rather than NULL.
  school_tb <- summarize_buffered_zones(
    buffered_df, treso_tb, school_sfis_2017, school_board_def, treso_zone_def
  )
  saveRDS(school_tb, paste0("output/school_tb_", dist, ".rds"))
}
- Test that
gBuffer() is returning intended results.
Explore Linear Regressions
- Segment the schools by board (at least)
- Build linear regression
- Explore other segmentations
# Choose the catchment area to explore
# Load the saved table for the 90th-percentile catchment and derive density
# columns. mutate() evaluates its arguments in order, so the densities below
# use the already-rescaled `shape.area_sum`.
# NOTE(review): the 1e6 divisor presumably converts m^2 to km^2 -- confirm.
school_tb <- mutate(
  readRDS("output/school_tb_90.rds"),
  shape.area_sum = shape.area_sum / 1e6,
  n_pop_density = n_pop_sum / shape.area_sum,
  n_sec_pop_density = n_sec_pop_sum / shape.area_sum,
  n_ele_pop_density = n_ele_pop_sum / shape.area_sum
)

# Five-number summary (plus mean) of the ADE column.
summary(school_tb[["ade"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 50.0 229.0 356.0 427.2 541.5 1975.0
# Exploratory plots for secondary-panel schools, each faceted by MOF region
# (rows) and board type (columns) with free scales.

# g1: distribution of ADE.
g1 <- school_tb %>%
  filter(panel == "Secondary") %>%
  ggplot(aes(x = ade)) +
  geom_histogram(bins = 50) +
  facet_grid(
    rows = vars(mof.region),
    cols = vars(board.type.name),
    scales = "free"
  ) +
  theme_minimal() +
  labs(x = "ADE", y = "Count")

# g2: ADE against catchment distance.
g2 <- school_tb %>%
  filter(panel == "Secondary") %>%
  ggplot(aes(x = catchment.dist, y = ade, label = school.name.x)) +
  geom_point(alpha = 0.5) +
  facet_grid(
    rows = vars(mof.region),
    cols = vars(board.type.name),
    scales = "free"
  ) +
  theme_minimal() +
  labs(x = "Catchment Distance", y = "ADE")

# g3: ADE against secondary population density, GTA region only.
g3 <- school_tb %>%
  filter(panel == "Secondary", mof.region == "GTA") %>%
  ggplot(aes(x = n_sec_pop_density, y = ade, label = school.name.x)) +
  geom_point(alpha = 0.5) +
  facet_grid(
    rows = vars(mof.region),
    cols = vars(board.type.name),
    scales = "free"
  ) +
  theme_minimal() +
  labs(x = "Secondary Population Density", y = "ADE")

# g4: ADE against the summed school-attendance count.
g4 <- school_tb %>%
  filter(panel == "Secondary") %>%
  ggplot(aes(x = attend_school_sum, y = ade, label = school.name.x)) +
  geom_point(alpha = 0.5) +
  facet_grid(
    rows = vars(mof.region),
    cols = vars(board.type.name),
    scales = "free"
  ) +
  theme_minimal() +
  labs(x = "Attend School", y = "ADE")

# Render each plot as an interactive plotly widget.
ggplotly(g1)
ggplotly(g2)
ggplotly(g3)
ggplotly(g4)
# List any schools whose MOF region is coded as the string "0".
school_tb %>% filter(mof.region == "0")
## # A tibble: 0 x 80
## # ... with 80 variables: sfis <int>, n_pop_sum <dbl>, n_hhlds_sum <dbl>,
## # n_ft_sum <dbl>, n_pt_sum <dbl>, n_unemp_sum <dbl>,
## # n_sec_pop_sum <dbl>, n_ele_pop_sum <dbl>, n_pre_pop_sum <dbl>,
## # n_zero_adult_sum <dbl>, n_one_adult_zero_child_sum <dbl>,
## # n_one_adult_one_child_sum <dbl>, n_one_adult_two_child_sum <dbl>,
## # n_one_adult_twoplus_child_sum <dbl>, n_two_adult_zero_child_sum <dbl>,
## # n_two_adult_one_child_sum <dbl>, n_two_adult_two_child_sum <dbl>,
## # n_two_adult_twoplus_child_sum <dbl>,
## # n_twoplus_adult_zero_child_sum <dbl>,
## # n_twoplus_adult_one_child_sum <dbl>,
## # n_twoplus_adult_two_child_sum <dbl>,
## # n_twoplus_adult_twoplus_child_sum <dbl>, occu_management_sum <dbl>,
## # occu_business_sum <dbl>, occu_science_sum <dbl>,
## # occu_health_sum <dbl>, occu_public_sum <dbl>,
## # occu_recreation_sum <dbl>, occu_sales_sum <dbl>,
## # occu_trades_sum <dbl>, occu_production_sum <dbl>,
## # occu_manufacturing_sum <dbl>, occu_notapplicable_sum <dbl>,
## # deg_none_sum <dbl>, deg_hs_sum <dbl>, deg_trades_sum <dbl>,
## # deg_ra_sum <dbl>, deg_col_sum <dbl>, deg_uni_sum <dbl>,
## # deg_ugrad_sum <dbl>, deg_grad_sum <dbl>, deg_na_sum <dbl>,
## # attend_school_sum <dbl>, shape.area_sum <dbl>, mean.income <dbl>,
## # mean.age <dbl>, dsb.index.x <dbl>, school.name.x <chr>,
## # catchment.dist <dbl>, board.type.name <chr>, area <chr>,
## # mof.region <chr>, year <int>, dsb.index.y <int>, board.name <chr>,
## # panel <chr>, school.name.y <chr>, ade <dbl>, otg <dbl>,
## # school.lat <dbl>, school.long <dbl>, status <chr>, dataset.id <int>,
## # utilization <dbl>, year.y <int>, bsid <int>, ade.forecast <dbl>,
## # ade.sec.forecast <dbl>, ade.elem.forecast <dbl>,
## # ade.jksk.forecast <dbl>, ade.g1g3.forecast <dbl>,
## # ade.g4g8.forecast <dbl>, area.m2 <dbl>, num.units.calculated <dbl>,
## # capacity.portable <dbl>, capacity.total <dbl>,
## # utilization.total <dbl>, n_pop_density <dbl>, n_sec_pop_density <dbl>,
## # n_ele_pop_density <dbl>